{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tensorflow的feature_column的语法演示"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "feature_column是data到model的桥梁，实现原始数据到Model所需要的格式。\n",
    "\n",
    "内容回顾：\n",
    "tf.feature_column包含如下方法：\n",
    "* 数值列（Numeric column）：tf.feature_column.numeric_column\n",
    "* 分桶列（Bucketized column）：tf.feature_column.bucketized_column\n",
    "* 分类标识列(Categorical identity column)：tf.feature_column.categorical_column_with_identity\n",
    "* 分类词汇列(Categorical vocabulary column)：tf.feature_column.categorical_column_with_vocabulary_list\n",
    "* 哈希处理的列(Hashed Column)：tf.feature_column.categorical_column_with_hash_bucket\n",
    "* 组合列(Crossed column)：tf.feature_column.crossed_column\n",
    "* 指标列(Indicator Column)：tf.feature_column.indicator_column\n",
    "* 嵌入列(Embedding Column)：tf.feature_column.embedding_column\n",
    "* 共享Embedding：tf.feature_column.shared_embeddings\n",
    "* 加权分类特征：tf.feature_column.weighted_categorical_column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import tensorflow as tf\n",
    "from tensorflow import feature_column\n",
    "from tensorflow.keras import layers\n",
    "\n",
    "tf.keras.backend.set_floatx('float32')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 准备dict数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {\n",
    "    'marks': [55,21,63,88,74,54,95,41,84,52],\n",
    "    'grade': ['average','poor','average','good','good','average','good','average','good','average'],\n",
    "    'point': ['c','f','c+','b+','b','c','a','d+','b+','c'],\n",
    "    'weight': [0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def demo(feature_column):\n",
    "    \"\"\"常用函数，整合data>feature_column>layer，并输出Layer结果\"\"\"\n",
    "    feature_layer = layers.DenseFeatures(feature_column)\n",
    "    print(feature_layer(data).numpy())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Numeric columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[55, 21, 63, 88, 74, 54, 95, 41, 84, 52]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[\"marks\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[55.]\n",
      " [21.]\n",
      " [63.]\n",
      " [88.]\n",
      " [74.]\n",
      " [54.]\n",
      " [95.]\n",
      " [41.]\n",
      " [84.]\n",
      " [52.]]\n"
     ]
    }
   ],
   "source": [
    "marks = feature_column.numeric_column(\"marks\")\n",
    "demo(marks)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Bucketized columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[55, 21, 63, 88, 74, 54, 95, 41, 84, 52]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[\"marks\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NumericColumn(key='marks', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "marks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 0. 0. 1. 0. 0. 0. 0.]\n",
      " [1. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 1. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 1. 0.]\n",
      " [0. 0. 0. 0. 0. 1. 0. 0.]\n",
      " [0. 0. 0. 1. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 0. 1.]\n",
      " [0. 0. 1. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 1. 0.]\n",
      " [0. 0. 0. 1. 0. 0. 0. 0.]]\n"
     ]
    }
   ],
   "source": [
    "marks_buckets = feature_column.bucketized_column(\n",
    "    marks, boundaries=[30,40,50,60,70,80,90])\n",
    "demo(marks_buckets)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Indicator columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['average',\n",
       " 'poor',\n",
       " 'average',\n",
       " 'good',\n",
       " 'good',\n",
       " 'average',\n",
       " 'good',\n",
       " 'average',\n",
       " 'good',\n",
       " 'average']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[\"grade\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VocabularyListCategoricalColumn(key='grade', vocabulary_list=('poor', 'average', 'good'), dtype=tf.string, default_value=-1, num_oov_buckets=0)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grade = feature_column.categorical_column_with_vocabulary_list(\n",
    "      'grade', ['poor', 'average', 'good'])\n",
    "grade"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 1. 0.]\n",
      " [1. 0. 0.]\n",
      " [0. 1. 0.]\n",
      " [0. 0. 1.]\n",
      " [0. 0. 1.]\n",
      " [0. 1. 0.]\n",
      " [0. 0. 1.]\n",
      " [0. 1. 0.]\n",
      " [0. 0. 1.]\n",
      " [0. 1. 0.]]\n"
     ]
    }
   ],
   "source": [
    "grade_one_hot = feature_column.indicator_column(grade)\n",
    "demo(grade_one_hot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Embedding columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['c', 'f', 'c+', 'b+', 'b', 'c', 'a', 'd+', 'b+', 'c']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[\"point\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['c', 'f', 'c+', 'b+', 'b', 'a', 'd+'], dtype=object)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "point_unique = pd.Series(data[\"point\"]).unique()\n",
    "point_unique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "point = feature_column.categorical_column_with_vocabulary_list(\n",
    "    \"point\", list(point_unique))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 1. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 1. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 1. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 1. 0. 0.]\n",
      " [1. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 1. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 1.]\n",
      " [0. 0. 0. 1. 0. 0. 0.]\n",
      " [1. 0. 0. 0. 0. 0. 0.]]\n"
     ]
    }
   ],
   "source": [
    "# categorical_column变成one-hot\n",
    "point_one_hot = feature_column.indicator_column(point)\n",
    "demo(point_one_hot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-0.0037021   0.3708323  -0.5864364 ]\n",
      " [-0.09577064 -0.70465165 -0.64716125]\n",
      " [-0.08357485  0.19009906 -0.8715153 ]\n",
      " [ 0.4796768   1.0348917   0.0362111 ]\n",
      " [ 0.38492042  0.05351927 -0.00737479]\n",
      " [-0.0037021   0.3708323  -0.5864364 ]\n",
      " [-0.76010287  0.27449778  0.8116004 ]\n",
      " [-0.7493177   0.5960407   0.9389187 ]\n",
      " [ 0.4796768   1.0348917   0.0362111 ]\n",
      " [-0.0037021   0.3708323  -0.5864364 ]]\n"
     ]
    }
   ],
   "source": [
    "# categorical_column变成embedding\n",
    "point_embedding = feature_column.embedding_column(point, dimension=3)\n",
    "demo(point_embedding)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5. Hashed feature columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['c', 'f', 'c+', 'b+', 'b', 'c', 'a', 'd+', 'b+', 'c']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[\"point\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 1. 0.]\n",
      " [1. 0. 0.]\n",
      " [1. 0. 0.]\n",
      " [1. 0. 0.]\n",
      " [0. 1. 0.]\n",
      " [0. 1. 0.]\n",
      " [1. 0. 0.]\n",
      " [0. 1. 0.]\n",
      " [1. 0. 0.]\n",
      " [0. 1. 0.]]\n"
     ]
    }
   ],
   "source": [
    "point_hashed = feature_column.categorical_column_with_hash_bucket(\n",
    "      'point', hash_bucket_size=3)\n",
    "demo(feature_column.indicator_column(point_hashed))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6. Crossed feature columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BucketizedColumn(source_column=NumericColumn(key='marks', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(30, 40, 50, 60, 70, 80, 90))"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "marks_buckets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VocabularyListCategoricalColumn(key='grade', vocabulary_list=('poor', 'average', 'good'), dtype=tf.string, default_value=-1, num_oov_buckets=0)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grade"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
      " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n",
      " [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]\n",
      " [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]\n",
      " [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]\n"
     ]
    }
   ],
   "source": [
    "crossed_feature = feature_column.crossed_column(\n",
    "    [marks_buckets, grade], hash_bucket_size=10)\n",
    "demo(feature_column.indicator_column(crossed_feature))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 7. Weighted Categorical Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VocabularyListCategoricalColumn(key='point', vocabulary_list=('c', 'f', 'c+', 'b+', 'b', 'a', 'd+'), dtype=tf.string, default_value=-1, num_oov_buckets=0)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "point"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /root/miniconda3/lib/python3.7/site-packages/tensorflow/python/feature_column/feature_column_v2.py:4366: sparse_merge (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "No similar op available at this time.\n",
      "[[0.1  0.   0.   0.   0.   0.   0.  ]\n",
      " [0.   0.11 0.   0.   0.   0.   0.  ]\n",
      " [0.   0.   0.12 0.   0.   0.   0.  ]\n",
      " [0.   0.   0.   0.13 0.   0.   0.  ]\n",
      " [0.   0.   0.   0.   0.14 0.   0.  ]\n",
      " [0.15 0.   0.   0.   0.   0.   0.  ]\n",
      " [0.   0.   0.   0.   0.   0.16 0.  ]\n",
      " [0.   0.   0.   0.   0.   0.   0.17]\n",
      " [0.   0.   0.   0.18 0.   0.   0.  ]\n",
      " [0.19 0.   0.   0.   0.   0.   0.  ]]\n"
     ]
    }
   ],
   "source": [
    "weight_categorical_column = feature_column.weighted_categorical_column(\n",
    "    point, 'weight')\n",
    "\n",
    "demo(feature_column.indicator_column(weight_categorical_column))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
